{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from lxml import etree\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import time\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_url(url):\n",
    "    # 输入链接，返回解析后的html\n",
    "    headers = {\n",
    "        \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63\"}\n",
    "    response = requests.get(url=url, headers=headers)\n",
    "    content = response.content.decode('utf-8', 'ignore')\n",
    "    html = etree.HTML(content)\n",
    "    return html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_base_info(page_url):\n",
    "    # 获取基础信息\n",
    "    html = parse_url(page_url)\n",
    "    titles = html.xpath(\n",
    "        '//div[@class=\"list-info\"]/h2[@class=\"title\"]/a/text()')  # 标题\n",
    "    urls = html.xpath(\n",
    "        '//div[@class=\"list-info\"]/h2[@class=\"title\"]/a/@href')  # 链接\n",
    "    total_prices = html.xpath(\n",
    "        '//div[@class=\"price\"]/p[@class=\"sum\"]/b/text()')  # 总价\n",
    "    unit_prices = html.xpath(\n",
    "        '//div[@class=\"price\"]/p[@class=\"unit\"]/text()')  # 均价\n",
    "    base_infos = []  # 使用一个列表存储所有信息\n",
    "    for title, url, total_price, unit_price in zip(titles, urls, total_prices, unit_prices):\n",
    "        # 将信息写入一个字典中\n",
    "        info = {}\n",
    "        info['标题'] = title\n",
    "        if url[0:5] != 'https':  # 有的链接不是https开头的，手动加上\n",
    "            url = 'https:'+url\n",
    "        info['链接'] = url.split('?')[0]  # 删掉链接后面跟的参数\n",
    "        info['总价'] = total_price\n",
    "        info['均价'] = unit_price\n",
    "        base_infos.append(info)\n",
    "    return base_infos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_extra_info(info):\n",
    "    # 进入详情页获取更多信息\n",
    "    info_url = info['链接']\n",
    "    html = parse_url(info_url)\n",
    "    try:\n",
    "        info['位置1'] = html.xpath(\n",
    "            '/html/body/div[4]/div[2]/div[2]/ul/li[2]/span[2]/a[1]/text()')[0].strip()\n",
    "    except:\n",
    "        info['位置1'] = ''\n",
    "    try:\n",
    "        info['位置2'] = html.xpath(\n",
    "            '/html/body/div[4]/div[2]/div[2]/ul/li[2]/span[2]/a[2]/text()')[0].replace('－', '').strip()\n",
    "    except:\n",
    "        info['位置2'] = ''\n",
    "    # 获取详情页表格中的信息\n",
    "    info_keys = html.xpath(\n",
    "        '//*[@id=\"generalSituation\"]//span[@class=\"mr_25 c_999\"]/text()')[1:]\n",
    "    info_values = html.xpath(\n",
    "        '//*[@id=\"generalSituation\"]//span[@class=\"c_000\"]')\n",
    "    info_values = [v.text for v in info_values]\n",
    "    for key, value in zip(info_keys, info_values):\n",
    "        info[key] = value\n",
    "\n",
    "    # 获取小区及周边信息\n",
    "    try:\n",
    "        info['小区名'] = html.xpath(\n",
    "            '//*[@id=\"xiaoWrap\"]/div/div[2]/h3/a/text()')[0].strip()\n",
    "    except:\n",
    "        info['小区名'] = ''\n",
    "    try:\n",
    "        info['小区均价'] = html.xpath(\n",
    "            '//*[@id=\"xiaoWrap\"]/div/div[2]/ul/li[1]/span[2]/text()')[0]\n",
    "    except:\n",
    "        info['小区均价'] = ''\n",
    "    try:\n",
    "        info['物业费'] = html.xpath(\n",
    "            '//*[@id=\"xiaoWrap\"]/div/div[2]/ul/li[3]/span[2]/text()')[0]\n",
    "    except:\n",
    "        info['物业费'] = ''\n",
    "    try:\n",
    "        info['容积率'] = html.xpath(\n",
    "            '//*[@id=\"xiaoWrap\"]/div/div[2]/ul/li[4]/span[2]/text()')[0]\n",
    "    except:\n",
    "        info['容积率'] = ''\n",
    "    try:\n",
    "        info['绿化率'] = html.xpath(\n",
    "            '//*[@id=\"xiaoWrap\"]/div/div[2]/ul/li[5]/span[2]/text()')[0]\n",
    "    except:\n",
    "        info['绿化率'] = ''\n",
    "    try:\n",
    "        info['车位信息'] = html.xpath(\n",
    "            '//*[@id=\"xiaoWrap\"]/div/div[2]/ul/li[6]/span[2]/text()')[0]\n",
    "\n",
    "    except:\n",
    "        info['车位信息'] = ''\n",
    "\n",
    "    return info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_url = 'https://xm.58.com/ershoufang/pn'\n",
    "infos = []\n",
    "for i in range(1, 7): # 爬取前六页\n",
    "    time.sleep(random.randint(10, 20))  # 设置休息时间应对反爬\n",
    "    page_url = base_url+str(i)\n",
    "    results = get_base_info(page_url)\n",
    "    infos.extend(results)\n",
    "    print(f'爬取页面{i}的基础信息成功！')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(0, len(infos)):\n",
    "    time.sleep(random.randint(10, 20))\n",
    "    infos[i] = get_extra_info(infos[i])\n",
    "    if infos[i]['位置1'] == '' and infos[i]['小区名'] == '':  # 如果这两个值都为空值，说明开始人机验证了\n",
    "        print(f'爬取第{i}条信息失败,请进行人机验证! ') # 点进去验证后出来记得该for循环的第一个值\n",
    "        print(infos[i]['链接'])\n",
    "        data = pd.DataFrame(infos)\n",
    "        data.to_csv('data2.csv')\n",
    "        break\n",
    "    else:\n",
    "        print(\"爬取第{}条信息成功：{}\".format(i, infos[i]['标题']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame(infos)\n",
    "data.to_csv('data2.csv')  # 导出到csv文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9-final"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}